// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// All data that is passed through a WebSocket with type "Text" needs to be
// validated as UTF8. Since this is done on the IO thread, it needs to be
// reasonably fast.

// We are only interested in the performance on valid UTF8. Invalid UTF8 will
// result in a connection failure, so is unlikely to become a source of
// performance issues.

#include "base/i18n/streaming_utf8_validator.h"

#include <stddef.h>

#include <string>

#include "base/bind.h"
#include "base/callback.h"
#include "base/macros.h"
#include "base/strings/string_util.h"
#include "base/strings/stringprintf.h"
#include "base/test/perf_time_logger.h"
#include "testing/gtest/include/gtest/gtest.h"

namespace base {
namespace {

    // We want to test ranges of valid UTF-8 sequences. These ranges are inclusive.
    // They are intended to be large enough that the validator needs to do
    // meaningful work while being in some sense "realistic" (e.g. control
    // characters are not included).
    //
    // Each constant holds the UTF-8 encoding of a single code point. A
    // Start/End pair delimits one range, and both members of a pair have the
    // same encoded length.

    // 1-byte sequences: the printable ASCII characters.
    const char kOneByteSeqRangeStart[] = " "; // U+0020
    const char kOneByteSeqRangeEnd[] = "~"; // U+007E

    // 2-byte sequences.
    const char kTwoByteSeqRangeStart[] = "\xc2\xa0"; // U+00A0 non-breaking space
    const char kTwoByteSeqRangeEnd[] = "\xc9\x8f"; // U+024F small y with stroke

    // 3-byte sequences.
    const char kThreeByteSeqRangeStart[] = "\xe3\x81\x82"; // U+3042 Hiragana "a"
    const char kThreeByteSeqRangeEnd[] = "\xe9\xbf\x83"; // U+9FC3 "to blink"

    // 4-byte sequences.
    const char kFourByteSeqRangeStart[] = "\xf0\xa0\x80\x8b"; // U+2000B
    const char kFourByteSeqRangeEnd[] = "\xf0\xaa\x9a\xb2"; // U+2A6B2

    // The different lengths of strings to test. These are requested lengths
    // passed to the string constructors below, which may return a string that
    // is somewhat longer. The largest entry, 1 << 20, is 1048576.
    const size_t kTestLengths[] = { 1, 32, 256, 32768, 1 << 20 };

    // Baseline for comparison: the simplest possible byte-at-a-time check.
    // Returns true only if no byte of |s| has its top bit set (an empty string
    // therefore passes). It is only tried on 1-byte UTF-8 sequences, because it
    // rejects every sequence containing top-bit-set bytes and so the results
    // would not be meaningful there.
    bool IsString7Bit(const std::string& s)
    {
        for (size_t pos = 0; pos < s.length(); ++pos) {
            const bool top_bit_set = (s[pos] & 0x80) != 0;
            if (top_bit_set)
                return false;
        }
        return true;
    }

    // Returns the sequence that follows |previous|, which is assumed to be a
    // valid UTF-8 sequence. The bytes are treated like the digits of a counter:
    // starting from the last byte, a trailing byte equal to 0xbf wraps around
    // to 0x80 and carries into the byte before it, while any other byte is
    // simply incremented and ends the process. The first byte is incremented
    // without any range check, so this is just barely smart enough to iterate
    // through the ranges defined above. Both the argument and the result are
    // DCHECKed with StreamingUtf8Validator::Validate().
    std::string NextUtf8Sequence(const std::string& previous)
    {
        DCHECK(StreamingUtf8Validator::Validate(previous));
        std::string next = previous;
        int index = static_cast<int>(previous.length()) - 1;
        while (index >= 0) {
            // All bytes in a UTF-8 sequence except the first one are
            // constrained to the range 0x80 to 0xbf, inclusive, so only those
            // bytes can overflow and carry.
            const bool must_carry = index > 0 && next[index] == '\xbf';
            if (!must_carry) {
                ++next[index];
                break;
            }
            next[index] = '\x80';
            --index;
        }
        DCHECK(StreamingUtf8Validator::Validate(next))
            << "Result \"" << next << "\" failed validation";
        return next;
    }

    // Signature shared by every validation function that the benchmarks run:
    // it receives the string to check and reports success as a bool.
    using TestTargetType = bool (*)(const std::string&);

    // Runs function |target| over |test_string| |times| times. The loop is
    // bracketed by a base::PerfTimeLogger labelled with |description|, which is
    // explicitly finished via Done() before returning. Returns true only if
    // every call to |target| returned true; |target| is always called the full
    // number of times, even after a failure.
    bool RunTest(const std::string& description,
        TestTargetType target,
        const std::string& test_string,
        int times)
    {
        base::PerfTimeLogger timer(description.c_str());
        bool all_passed = true;
        for (int remaining = times; remaining > 0; --remaining) {
            if (!target(test_string))
                all_passed = false;
        }
        timer.Done();
        return all_passed;
    }

    // Constructs a string made of whole copies of |input| placed back to back:
    // the smallest number of copies, but always at least one, whose combined
    // size equals or exceeds |length| bytes. The result can therefore be up to
    // |input.length() - 1| bytes longer than |length|, and a |length| of zero
    // still yields a single copy.
    //
    // An empty |input| can never reach a positive |length|, so an empty string
    // is returned in that case instead of looping forever.
    std::string ConstructRepeatedTestString(const std::string& input,
        size_t length)
    {
        const size_t unit = input.length();
        if (unit == 0)
            return std::string();

        // Round |length| up to a whole number of copies. This is written as
        // (length - 1) / unit + 1 rather than (length + unit - 1) / unit so
        // that the intermediate sum cannot overflow.
        const size_t copies = length == 0 ? 1 : (length - 1) / unit + 1;
        const size_t target_size = copies * unit;

        std::string output;
        // The final size is known, so allocate once up front.
        output.reserve(target_size);
        output.append(input);

        // Doubling keeps the number of append calls logarithmic. |output| is
        // always a whole number of copies of |input|, so appending a prefix of
        // it whose size is a multiple of |unit| extends the repetition.
        while (output.length() < target_size) {
            const size_t missing = target_size - output.length();
            const size_t chunk = missing < output.length() ? missing : output.length();
            output.append(output, 0, chunk);
        }
        return output;
    }

    // Constructs a string of at least |length| bytes from the range of UTF-8
    // sequences between |input_start| and |input_end|, inclusive. Starting
    // with |input_start|, successive sequences produced by NextUtf8Sequence()
    // are appended until either |length| bytes have been reached or
    // |input_end| has been appended. If the string is still too short at that
    // point, it is repeated via ConstructRepeatedTestString() until it equals
    // or exceeds |length| bytes. |input_start| and |input_end| must be valid
    // UTF-8 sequences.
    std::string ConstructRangedTestString(const std::string& input_start,
        const std::string& input_end,
        size_t length)
    {
        std::string expanded = input_start;
        for (std::string current = input_start;
             expanded.length() < length && current != input_end;) {
            current = NextUtf8Sequence(current);
            expanded += current;
        }
        if (expanded.length() >= length)
            return expanded;
        return ConstructRepeatedTestString(expanded, length);
    }

    // Pairs a validation function with the name under which its results are
    // reported. Instances are brace-initialised in kTestFunctions, so the
    // member order matters.
    struct TestFunctionDescription {
        // The function that RunSomeTests() hands to RunTest() to be run.
        TestTargetType function;
        // Name substituted for the leading %s of the format string given to
        // RunSomeTests().
        const char* function_name;
    };

    bool IsStringUTF8(const std::string& str)
    {
        return base::IsStringUTF8(base::StringPiece(str));
    }

    // The validation functions to benchmark, each paired with the name used
    // for it in the reported description.
    //
    // IsString7Bit is intentionally placed last so it can be excluded easily:
    // the tests pass a count to RunSomeTests(), and the multi-byte tests pass 2
    // so that only the first two entries are run.
    const TestFunctionDescription kTestFunctions[] = {
        { &StreamingUtf8Validator::Validate, "StreamingUtf8Validator" },
        { &IsStringUTF8, "IsStringUTF8" }, { &IsString7Bit, "IsString7Bit" }
    };

    // Construct a test string from |construct_test_string| for each of the lengths
    // in |kTestLengths| in turn. For each string, run each test in |test_functions|
    // for a number of iterations such that the total number of bytes validated
    // is around 16MB.
    void RunSomeTests(
        const char format[],
        base::Callback<std::string(size_t length)> construct_test_string,
        const TestFunctionDescription* test_functions,
        size_t test_count)
    {
        for (size_t i = 0; i < arraysize(kTestLengths); ++i) {
            const size_t length = kTestLengths[i];
            const std::string test_string = construct_test_string.Run(length);
            const int real_length = static_cast<int>(test_string.length());
            const int times = (1 << 24) / real_length;
            for (size_t test_index = 0; test_index < test_count; ++test_index) {
                EXPECT_TRUE(RunTest(StringPrintf(format,
                                        test_functions[test_index].function_name,
                                        real_length,
                                        times),
                    test_functions[test_index].function,
                    test_string,
                    times));
            }
        }
    }

    // Benchmarks strings made by repeating the 1-byte sequence
    // kOneByteSeqRangeStart (U+0020).
    TEST(StreamingUtf8ValidatorPerfTest, OneByteRepeated)
    {
        // Pure ASCII input: run all three entries of kTestFunctions, including
        // the IsString7Bit baseline.
        const size_t function_count = 3;
        const base::Callback<std::string(size_t)> make_string
            = base::Bind(ConstructRepeatedTestString, kOneByteSeqRangeStart);
        RunSomeTests("%s: bytes=1 repeated length=%d repeat=%d",
            make_string,
            kTestFunctions,
            function_count);
    }

    // Benchmarks strings that step through the 1-byte range from
    // kOneByteSeqRangeStart (U+0020) to kOneByteSeqRangeEnd (U+007E).
    TEST(StreamingUtf8ValidatorPerfTest, OneByteRange)
    {
        // Pure ASCII input: run all three entries of kTestFunctions, including
        // the IsString7Bit baseline.
        const size_t function_count = 3;
        const base::Callback<std::string(size_t)> make_string = base::Bind(
            ConstructRangedTestString, kOneByteSeqRangeStart, kOneByteSeqRangeEnd);
        RunSomeTests("%s: bytes=1 ranged length=%d repeat=%d",
            make_string,
            kTestFunctions,
            function_count);
    }

    // Benchmarks strings made by repeating the 2-byte sequence
    // kTwoByteSeqRangeStart (U+00A0).
    TEST(StreamingUtf8ValidatorPerfTest, TwoByteRepeated)
    {
        // Only the first two entries of kTestFunctions are run: IsString7Bit,
        // the last entry, rejects bytes with the top bit set.
        const size_t function_count = 2;
        const base::Callback<std::string(size_t)> make_string
            = base::Bind(ConstructRepeatedTestString, kTwoByteSeqRangeStart);
        RunSomeTests("%s: bytes=2 repeated length=%d repeat=%d",
            make_string,
            kTestFunctions,
            function_count);
    }

    // Benchmarks strings that step through the 2-byte range from
    // kTwoByteSeqRangeStart (U+00A0) to kTwoByteSeqRangeEnd (U+024F).
    TEST(StreamingUtf8ValidatorPerfTest, TwoByteRange)
    {
        // Only the first two entries of kTestFunctions are run: IsString7Bit,
        // the last entry, rejects bytes with the top bit set.
        const size_t function_count = 2;
        const base::Callback<std::string(size_t)> make_string = base::Bind(
            ConstructRangedTestString, kTwoByteSeqRangeStart, kTwoByteSeqRangeEnd);
        RunSomeTests("%s: bytes=2 ranged length=%d repeat=%d",
            make_string,
            kTestFunctions,
            function_count);
    }

    // Benchmarks strings made by repeating the 3-byte sequence
    // kThreeByteSeqRangeStart (U+3042).
    TEST(StreamingUtf8ValidatorPerfTest, ThreeByteRepeated)
    {
        // Only the first two entries of kTestFunctions are run: IsString7Bit,
        // the last entry, rejects bytes with the top bit set.
        const size_t function_count = 2;
        const base::Callback<std::string(size_t)> make_string
            = base::Bind(ConstructRepeatedTestString, kThreeByteSeqRangeStart);
        RunSomeTests("%s: bytes=3 repeated length=%d repeat=%d",
            make_string,
            kTestFunctions,
            function_count);
    }

    // Benchmarks strings that step through the 3-byte range from
    // kThreeByteSeqRangeStart (U+3042) to kThreeByteSeqRangeEnd (U+9FC3).
    TEST(StreamingUtf8ValidatorPerfTest, ThreeByteRange)
    {
        // Only the first two entries of kTestFunctions are run: IsString7Bit,
        // the last entry, rejects bytes with the top bit set.
        const size_t function_count = 2;
        const base::Callback<std::string(size_t)> make_string
            = base::Bind(ConstructRangedTestString,
                kThreeByteSeqRangeStart,
                kThreeByteSeqRangeEnd);
        RunSomeTests("%s: bytes=3 ranged length=%d repeat=%d",
            make_string,
            kTestFunctions,
            function_count);
    }

    // Benchmarks strings made by repeating the 4-byte sequence
    // kFourByteSeqRangeStart (U+2000B).
    TEST(StreamingUtf8ValidatorPerfTest, FourByteRepeated)
    {
        // Only the first two entries of kTestFunctions are run: IsString7Bit,
        // the last entry, rejects bytes with the top bit set.
        const size_t function_count = 2;
        const base::Callback<std::string(size_t)> make_string
            = base::Bind(ConstructRepeatedTestString, kFourByteSeqRangeStart);
        RunSomeTests("%s: bytes=4 repeated length=%d repeat=%d",
            make_string,
            kTestFunctions,
            function_count);
    }

    // Benchmarks strings that step through the 4-byte range from
    // kFourByteSeqRangeStart (U+2000B) to kFourByteSeqRangeEnd (U+2A6B2).
    TEST(StreamingUtf8ValidatorPerfTest, FourByteRange)
    {
        // Only the first two entries of kTestFunctions are run: IsString7Bit,
        // the last entry, rejects bytes with the top bit set.
        const size_t function_count = 2;
        const base::Callback<std::string(size_t)> make_string
            = base::Bind(ConstructRangedTestString,
                kFourByteSeqRangeStart,
                kFourByteSeqRangeEnd);
        RunSomeTests("%s: bytes=4 ranged length=%d repeat=%d",
            make_string,
            kTestFunctions,
            function_count);
    }

} // namespace
} // namespace base
